# -*- coding: utf-8 -*-
import copy
import random
from zc_core.spiders.base import BaseSpider
from scrapy import Request
from scrapy.utils.project import get_project_settings
from scrapy.exceptions import IgnoreRequest

from zc_core.dao.sku_dao import SkuDao
from zc_core.dao.sku_pool_dao import SkuPoolDao
from zc_core.util.batch_gen import time_to_batch_no
from zc_core.util.http_util import retry_request
from zc_core.dao.batch_dao import BatchDao
from zc_core.util.done_filter import DoneFilter
from chdtp.utils.login import SeleniumLogin
from chdtp.rules import *


class FullSpider(BaseSpider):
    """Full-catalog spider: fetches the detail page of every SKU in a batch.

    Takes a ``batchNo`` argument, records the batch via ``BatchDao``, and
    skips SKUs already collected in this batch unless the ``FORCE_RECOVER``
    setting is truthy.
    """
    name = "full"
    custom_settings = {
        'CONCURRENT_REQUESTS': 24,
        # 'DOWNLOAD_DELAY': 0.05,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 24,
        'CONCURRENT_REQUESTS_PER_IP': 24,
        'COOKIES_ENABLED': False,
    }
    # Item detail endpoint (takes the SKU link id).
    item_url = 'https://www.chdtp.com/hdsc/wzgl/detailScjtCpzsAction.action?spxxId={}'
    # Price endpoint.
    price_url = 'https://www.chdtp.com/hdsc/wzgl/getJgDetialCpzsAction.action'
    # Static request headers — hoisted to class level so the dict is built
    # once instead of once per request.
    _item_headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8'
    }

    def __init__(self, batchNo=None, *args, **kwargs):
        super(FullSpider, self).__init__(batchNo=batchNo, *args, **kwargs)
        # Create the batch record up front.
        BatchDao().create_batch(self.batch_no)
        # Filter of already-collected link ids, used to avoid re-crawling.
        self.done_filter = DoneFilter(self.batch_no, fields={'linkId': 1}, filter_key='linkId')

    def start_requests(self):
        """Yield one detail-page request per SKU still to be collected."""
        settings = get_project_settings()
        sku_list = SkuDao().get_batch_sku_list(self.batch_no,
                                               fields={'_id': 1, 'batchNo': 1, 'supplierName': 1, 'offlineTime': 1})
        self.logger.info('全量: %s' % (len(sku_list)))
        # BUG FIX: the original pre-filtered done SKUs unconditionally and then
        # re-checked `FORCE_RECOVER` inside the loop — but by then done SKUs
        # were already gone, so the override was dead code. Honor it here:
        # when FORCE_RECOVER is set, already-done SKUs are re-collected.
        if settings.get('FORCE_RECOVER', False):
            dist_list = list(sku_list)
        else:
            dist_list = [x for x in sku_list if not self.done_filter.contains(x.get('_id'))]
        self.logger.info('目标：%s' % (len(dist_list)))
        random.shuffle(dist_list)
        # Hoist the loop-invariant setting lookup out of the loop.
        max_offline = settings.get('MAX_OFFLINE_TIME', 2)
        for sku in dist_list:
            link_id = sku.get("_id")
            sp_name = sku.get("supplierName")
            # Skip SKUs that have been seen offline too many times (stale listings).
            offline_time = sku.get('offlineTime', 0)
            if offline_time > max_offline:
                self.logger.info('忽略: [%s][%s]', link_id, offline_time)
                continue

            yield Request(
                url=self.item_url.format(link_id),
                method='GET',
                meta={
                    'reqType': 'item',
                    'batchNo': self.batch_no,
                    "linkId": link_id,
                    "supplierName": sp_name,
                },
                headers=self._item_headers,
                callback=self.parse_item_data,
                errback=self.error_back,
            )

    def parse_item_data(self, response):
        """Parse an item detail response and yield the parsed item.

        NOTE(review): the unqualified ``parse_item_data(response)`` call below
        resolves to the module-level function star-imported from
        ``chdtp.rules`` — method bodies do not see class scope — not to this
        method. The shadowing is confusing but intentional-looking; renaming
        the method would change the callback interface, so it is kept.
        """
        meta = response.meta
        link_id = meta.get("linkId")
        item = parse_item_data(response)
        self.logger.info('商品: %s id: %s' % (link_id, item.get('skuId')))
        yield item
